#!/bin/bash
#   MID.format.sh
#
##  DIRECTORY STRUCTURE
##
##  Let /DataYYYY be the directory holding a set of Lexis-Nexis downloads from various
##  sources, e.g.
##     AP_2002      [data]
##     BBC_2002     [data]
##     AFP_2002     [data]
##     Programs     [contains the programs rid.pl, genf.pl, dups.pl, break.pl]
##     Process      [this is where the processed data files end up]
##
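##  Each of the data directories has the following structure: one folder per month, e.g.
##
##     April_2002  December_2002  January_2002  June_2002  May_2002  October_2002
##     August_2002  February_2002  July_2002  March_2002  November_2002  September_2002
##
##  Those folders, in turn, contain the LN downloads.
##
##  NOTE: the month folder names are hard-coded in the body of this script (currently
##  January_2009 ... December_2009). Edit them to match the year being processed.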
##
##  TO RUN SCRIPT:
##
##  1. Make sure that the Process directory exists -- this will be empty before the script
##     is run the first time.
##  2. cd to a data directory, cd AP_2002
##  3. cp ../Programs/MID.format.sh .
##  4. chmod +x MID.format.sh
##  5. ./MID.format.sh fileprefix  where fileprefix is a prefix that will identify the output files.
##
##  Example:
##           cd AP_2002
##           cp ../Programs/MID.format.sh .
##           chmod +x MID.format.sh
##           ./MID.format.sh AP2002
##
##  Script takes a while to run but provides lots of feedback as it is running.
##
##  PROVENANCE:
##  Programmer: Philip A. Schrodt
##              Dept of Political Science
##              Pennsylvania State University
##              227 Pond Laboratory
##              University Park, PA, 16802 U.S.A.
##              http://eventdata.psu.edu
##
##  Report bugs to: schrodt@psu.edu
##
##  Programming supported by National Science Foundation Political Science Program Grant
##  SES-0719634 "Improving the Efficiency of Militarized Interstate Dispute Data Collection using
##  Automated Textual Analysis" and SES-0924240, "MID4: Updating the Militarized Dispute Data Set
##  2002-2010."
##
##  REVISION HISTORY:
##  25-Nov-09:  Initial version
##  29-Jan-10:  Additional components integrated
##  04-Mar-10:  Added initial directory and file checks
##
##  Programming supported by National Science Foundation Grant SES-0924240,
##  "MID4: Updating the Militarized Dispute Data Set 2002-2010."



# check that the directory structure is correct

if [ ! -e "../Process" ]
then
  echo "This script requires a folder named \"Process\": see instructions"
  echo "Script is exiting without making changes. Ciao."
  exit
fi

# check against possible overwriting of files

testname="${1}.0"
if [ -e "../Process/$testname" ]
then
  echo "A file with the name \"${testname}\" already exists in the directory \"Process\""
  echo "Please change the prefix or remove that file if you intend to overwrite"
  echo "any previous files which have this prefix."
  echo "Script is exiting without making changes. Ciao."
  exit
fi




cp ../Programs/*.pl .

echo Processing files
cd January_2009/
ls > ../files.list
mv * ..
cd ..
cd February_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd March_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd April_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd May_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd June_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd July_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd August_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd September_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd October_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd November_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt
cd December_2009/
ls > ../scr.ls.txt
mv * ..
cd ..
cat scr.ls.txt >> files.list
rm scr.ls.txt

perl genf.pl files.list
#
echo Returning files to folders
#
mv Jan*.TXT January_2009
mv Jan*.txt January_2009
mv Jan*.Txt January_2009
mv Feb*.TXT February_2009
mv Feb*.txt February_2009
mv Feb*.Txt February_2009
mv Mar*.TXT March_2009
mv Mar*.txt March_2009
mv Mar*.Txt March_2009
mv Apr*.TXT April_2009
mv Apr*.txt April_2009
mv Apr*.Txt April_2009
mv May*.TXT May_2009
mv May*.txt May_2009
mv May*.Txt May_2009
mv Jun*.TXT June_2009
mv Jun*.txt June_2009
mv Jun*.Txt June_2009
mv Jul*.TXT July_2009
mv Jul*.txt July_2009
mv Jul*.Txt July_2009
mv Aug*.TXT August_2009
mv Aug*.txt August_2009
mv Aug*.Txt August_2009
mv Sep*.TXT September_2009
mv Sep*.txt September_2009
mv Sep*.Txt September_2009
mv Oct*.TXT October_2009
mv Oct*.txt October_2009
mv Oct*.Txt October_2009
mv Nov*.TXT November_2009
mv Nov*.txt November_2009
mv Nov*.Txt November_2009
mv Dec*.TXT December_2009
mv Dec*.txt December_2009
mv Dec*.Txt December_2009
#
# run the reformatting programs
#

 echo Duplication detection with dups.pl
 perl dups.pl files.list  # input is 'filename'.genfed
 echo Breaking files into 5000 story blocks with break.pl
## Note: $* is a string specified in the pbs script that is used in break.pl
## Note: $* allows for a unique filename for each of the 4 search strings
 perl break.pl files.list 5000 $* # input is 'filename'.NoDupStories
 echo Moving files into /Process directory
## Note: All files have been moved to the Process directory
 mv $*  ../Process
echo Script is complete
